Naive similarity function

The most basic similarity function is a comparison of what functions are called in each cell. Primarily, we look at three different possible definitions of function name:

  1. Function call as it appears
  2. Full function call
  3. End of function call

For example, the function call in:

from sklearn import linear_model
linear_model.logistic_regression()

Would be written as the following:

  1. linear_model.logistic_regression()
  2. sklearn.linear_model.logistic_regression()
  3. logistic_regression()

Let's take a look at what happens when we take the Jaccard similarity of the function calls that appear in each cell.


In [1]:
# Necessary imports 
import os
import time
from nbminer.features.features import Features
from nbminer.notebook_miner import NotebookMiner
from nbminer.stats.summary import Summary
from nbminer.stats.multiple_summary import MultipleSummary

In [2]:
from nbminer.stats.summary import Summary

In [3]:
# Gather every .ipynb file sitting one level below the testbed directory
# (one sub-directory per entry) and wrap them all in a Features object.
testbed_root = '../testbed/Final'
people = os.listdir(testbed_root)
notebooks = []
for entry in people:
    person_dir = os.path.join(testbed_root, entry)
    # Plain files at the top level are ignored; only directories are searched.
    if not os.path.isdir(person_dir):
        continue
    for filename in os.listdir(person_dir):
        if filename.endswith('.ipynb'):
            notebooks.append(os.path.join(person_dir, filename))
a = Features([NotebookMiner(n) for n in notebooks])

In [6]:
from nbminer.preprocess.get_ast_features import GetASTFeatures
from nbminer.preprocess.get_imports import GetImports
from nbminer.preprocess.get_simple_features import GetSimpleFeatures
from nbminer.results.similarity.jaccard_similarity import SegmentJaccardSimilarity
# Compute the Jaccard similarity between each pair of cells.
# First run the preprocessing stages in order; each stage's transform()
# result is fed to the next stage.
gaf, gi, sf = GetASTFeatures(), GetImports(), GetSimpleFeatures()
for stage in (gaf, gi, sf):
    a = stage.transform(a)

# Only the similarity computation itself is timed, not the preprocessing.
start = time.time()
segJS = SegmentJaccardSimilarity()
rd, cls = segJS.transform(a)
print('Time elapsed: ', time.time() - start)


0 / 9366
500 / 9366
1000 / 9366
1500 / 9366
2000 / 9366
2500 / 9366
3000 / 9366
3500 / 9366
4000 / 9366
4500 / 9366
5000 / 9366
5500 / 9366
6000 / 9366
6500 / 9366
7000 / 9366
7500 / 9366
8000 / 9366
8500 / 9366
9000 / 9366
Time elapsed:  2063.6132860183716

Now that we have computed all of the Jaccard similarity metrics, we can start looking at the results.


In [33]:
# Pull the three similarity scores out of every entry of rd, one list per
# score type. All three lists follow rd's iteration order.
short_similarities = [rd[key]['short_similarity'] for key in rd]
full_similarities = [rd[key]['full_similarity'] for key in rd]
call_similarities = [rd[key]['call_similarity'] for key in rd]

In [34]:
import numpy as np
# Convert the three score lists to NumPy arrays for the statistics below.
short_similarities, full_similarities, call_similarities = (
    np.array(scores)
    for scores in (short_similarities, full_similarities, call_similarities)
)

In [35]:
# Report the mean score of each similarity variant: short, full, then call.
for scores in (short_similarities, full_similarities, call_similarities):
    print(np.mean(scores))


0.0153660049337
0.00375275334356
0.0037363902088

In [36]:
%matplotlib inline

import matplotlib
import numpy as np
import matplotlib.pyplot as plt

In [37]:
# Plot the raw score distributions side by side: short, full, call.
plt.rcParams['figure.figsize'] = (20, 10)
fig, axes = plt.subplots(nrows=1, ncols=3)
short_ax, full_ax, call_ax = axes
short_ax.hist(short_similarities, bins=10)
full_ax.hist(full_similarities, bins=10)
# Keep this call last: as the cell's final expression, its return value is
# what the notebook echoes as the cell output.
call_ax.hist(call_similarities, bins=10)


Out[37]:
(array([  3.12766790e+07,   1.93062000e+05,   1.01662000e+05,
          4.09540000e+04,   3.74100000e+03,   3.34650000e+04,
          5.62000000e+03,   2.63000000e+02,   6.70000000e+01,
          2.32700000e+04]),
 array([ 0. ,  0.1,  0.2,  0.3,  0.4,  0.5,  0.6,  0.7,  0.8,  0.9,  1. ]),
 <a list of 10 Patch objects>)

It looks like we have a lot of zeros here. Let's see exactly how many there are, then take them out to get a better look at the rest.


In [38]:
# Count how many short-name similarity scores are strictly positive.
# The comparison and the count are vectorized: iterating tens of millions
# of array elements in a Python-level comprehension is needlessly slow.
# int() keeps the printed values plain Python numbers, as before.
greater_than_0 = int(np.count_nonzero(np.asarray(short_similarities) > 0))
total_length = len(short_similarities)
print('Total length: ', total_length)
print('Greater than 0: ', greater_than_0)
print('Fraction greater than 0: ', greater_than_0 / total_length)


Total length:  31678783
Greater than 0:  1589938
Fraction greater than 0:  0.05018936491341855

In [39]:
# Count how many full-name similarity scores are strictly positive.
# The comparison and the count are vectorized: iterating tens of millions
# of array elements in a Python-level comprehension is needlessly slow.
# int() keeps the printed values plain Python numbers, as before.
greater_than_0 = int(np.count_nonzero(np.asarray(full_similarities) > 0))
total_length = len(full_similarities)
print('Total length: ', total_length)
print('Greater than 0: ', greater_than_0)
print('Fraction greater than 0: ', greater_than_0 / total_length)


Total length:  31678783
Greater than 0:  550499
Fraction greater than 0:  0.01737752993857119

In [40]:
# Count how many call similarity scores are strictly positive.
# The comparison and the count are vectorized: iterating tens of millions
# of array elements in a Python-level comprehension is needlessly slow.
# int() keeps the printed values plain Python numbers, as before.
greater_than_0 = int(np.count_nonzero(np.asarray(call_similarities) > 0))
total_length = len(call_similarities)
print('Total length: ', total_length)
print('Greater than 0: ', greater_than_0)
print('Fraction greater than 0: ', greater_than_0 / total_length)


Total length:  31678783
Greater than 0:  549272
Fraction greater than 0:  0.017338797390038626

In [41]:
# Histogram only the strictly positive scores of each similarity variant.
# Boolean-mask indexing selects the same elements as filtering one by one
# in Python, but does so inside NumPy.
plt.rcParams['figure.figsize'] = (20, 10)
fig, axes = plt.subplots(1, 3)
short_scores = np.asarray(short_similarities)
full_scores = np.asarray(full_similarities)
call_scores = np.asarray(call_similarities)
axes[0].hist(short_scores[short_scores > 0], bins=10)
axes[1].hist(full_scores[full_scores > 0], bins=10)
# Keep this call last: as the cell's final expression, its return value is
# what the notebook echoes as the cell output.
axes[2].hist(call_scores[call_scores > 0], bins=10)


Out[41]:
(array([  2.08424000e+05,   1.78872000e+05,   5.48050000e+04,
          4.37220000e+04,   3.37080000e+04,   5.21000000e+02,
          5.62000000e+03,   3.09000000e+02,   2.10000000e+01,
          2.32700000e+04]),
 array([ 0.01538462,  0.11384615,  0.21230769,  0.31076923,  0.40923077,
         0.50769231,  0.60615385,  0.70461538,  0.80307692,  0.90153846,  1.        ]),
 <a list of 10 Patch objects>)

Specific Examples from the code

OK, now that we have an idea of the general trends in the data, let's start looking at specific example code from the notebooks.


In [43]:
# Show up to five cell pairs whose short-name similarity is higher than their
# full-name similarity, while the full-name similarity is itself above 0.5.
# Pairs whose first cell's code contains 'head' or 'sum' are skipped.
total_examples = 5
for key in rd:
    pair = rd[key]
    short_score = pair['short_similarity']
    full_score = pair['full_similarity']
    if not (short_score > full_score and full_score > .5):
        continue
    code_x = pair['code_x'][0]
    if 'head' in code_x or 'sum' in code_x:
        continue
    print("LOOKING AT DIFFERENCE BETWEEN CODE X AND Y")
    print("CODE X:")
    print(code_x)
    print("\n\nCODE Y:")
    print(pair['code_y'][0])
    print("\n\n\n\n\n\n")
    total_examples -= 1
    # Stop as soon as the requested number of examples has been shown.
    if total_examples == 0:
        break


LOOKING AT DIFFERENCE BETWEEN CODE X AND Y
CODE X:

# coding: utf-8

# In[ ]:

"""
Check if there are any words starting with # in the 'text' field.
if it is the case, we add it to a list that we return at the end
"""
def find_hashtags(texts):
    hashtags = []
    for i in range(0,len(texts)):
        if texts[i][0] in '#':
            hashtags.append(texts[i][1:])
    return hashtags




CODE Y:

# coding: utf-8

# In[ ]:

hashtags = []
for i in range(len(df_ethz)):
    hh = df_ethz2['entities'][i]['hashtags']
    if len(hh) > 0:
        txts = []
        for j in hh:
            txts.append(j['text'])
        hashtags.append(txts)
    else:
        hashtags.append(hh)









LOOKING AT DIFFERENCE BETWEEN CODE X AND Y
CODE X:

# coding: utf-8

# In[ ]:

plt.plot(rmses,'b')
plt.title("RMSES / estimators")
plt.xlabel("Number of estimators")
plt.ylabel("RMSE")
plt.show()




CODE Y:

# coding: utf-8

# In[ ]:

eth_reduced.favorite_count.value_counts().plot(kind='bar', figsize=(10,10))
plt.ylabel('Frequency')
plt.xlabel('Value of favorite count')
plt.title('Favorite count for ETH')
plt.show()









LOOKING AT DIFFERENCE BETWEEN CODE X AND Y
CODE X:

# coding: utf-8

# In[ ]:

plt.plot(rmses,'b')
plt.title("RMSES / estimators")
plt.xlabel("Number of estimators")
plt.ylabel("RMSE")
plt.show()




CODE Y:

# coding: utf-8

# In[ ]:

eth_reduced.retweet_count.value_counts().plot(kind='bar', figsize=(10,10))
plt.ylabel('Frequency')
plt.xlabel('Value of retweet count')
plt.title('Retweet count for ETH')
plt.show()









LOOKING AT DIFFERENCE BETWEEN CODE X AND Y
CODE X:

# coding: utf-8

# In[ ]:

plt.plot(rmses,'b')
plt.title("RMSES / estimators")
plt.xlabel("Number of estimators")
plt.ylabel("RMSE")
plt.show()




CODE Y:

# coding: utf-8

# In[ ]:

epfl_reduced.favorite_count.value_counts().plot(kind='bar', figsize=(10,10))
plt.ylabel('Frequency')
plt.xlabel('Value of favorite count')
plt.title('Favorite count for EPFL')
plt.show()









LOOKING AT DIFFERENCE BETWEEN CODE X AND Y
CODE X:

# coding: utf-8

# In[ ]:

plt.plot(rmses,'b')
plt.title("RMSES / estimators")
plt.xlabel("Number of estimators")
plt.ylabel("RMSE")
plt.show()




CODE Y:

# coding: utf-8

# In[ ]:

epfl_reduced.retweet_count.value_counts().plot(kind='bar', figsize=(10,10))
plt.ylabel('Frequency')
plt.xlabel('Value of retweet count')
plt.title('Retweet count for EPFL')
plt.show()









Most similar


In [44]:
# Print the first five cell pairs whose full-name similarity is above 0.5.
total_examples = 5
for key in rd:
    entry = rd[key]
    if entry['full_similarity'] > .5:
        print("LOOKING AT DIFFERENCE BETWEEN CODE X AND Y")
        # The same heading-then-code layout is used for both cells of the pair.
        for heading, field in (("CODE X:", 'code_x'), ("\n\nCODE Y:", 'code_y')):
            print(heading)
            print(entry[field][0])
        print("\n\n\n\n\n\n")
        total_examples -= 1
        if total_examples == 0:
            break


LOOKING AT DIFFERENCE BETWEEN CODE X AND Y
CODE X:

# coding: utf-8

# In[ ]:

import pandas as pd
import numpy as np
import os

from sklearn.linear_model import LinearRegression
from sklearn import metrics

import matplotlib.pyplot as plt
get_ipython().magic('matplotlib inline')




CODE Y:

# coding: utf-8

# In[ ]:

get_ipython().magic('matplotlib inline')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
pd.options.mode.chained_assignment = None  # default='warn'
import warnings
warnings.filterwarnings('ignore')









LOOKING AT DIFFERENCE BETWEEN CODE X AND Y
CODE X:

# coding: utf-8

# In[ ]:

import pandas as pd
import numpy as np
import os

from sklearn.linear_model import LinearRegression
from sklearn import metrics

import matplotlib.pyplot as plt
get_ipython().magic('matplotlib inline')




CODE Y:

# coding: utf-8

# In[ ]:

get_ipython().magic('matplotlib inline')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from nltk import word_tokenize

from string import punctuation
from nltk.corpus import stopwords
import nltk

import json
import seaborn as sns
sns.set_context('notebook')









LOOKING AT DIFFERENCE BETWEEN CODE X AND Y
CODE X:

# coding: utf-8

# In[ ]:

import pandas as pd
import numpy as np
import os

from sklearn.linear_model import LinearRegression
from sklearn import metrics

import matplotlib.pyplot as plt
get_ipython().magic('matplotlib inline')




CODE Y:

# coding: utf-8

# In[ ]:

get_ipython().magic('matplotlib inline')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
#import seaborn as sns
#sns.set_context('notebook')

epfl = pd.read_json('./epfl_en.json')
ethz = pd.read_json('./eth_en.json')









LOOKING AT DIFFERENCE BETWEEN CODE X AND Y
CODE X:

# coding: utf-8

# In[ ]:

import pandas as pd
import numpy as np
import os

from sklearn.linear_model import LinearRegression
from sklearn import metrics

import matplotlib.pyplot as plt
get_ipython().magic('matplotlib inline')




CODE Y:

# coding: utf-8

# In[ ]:

import pandas as pd 
import json 
import warnings
warnings.filterwarnings('ignore')

get_ipython().magic('matplotlib inline')

import matplotlib
import numpy as np
import matplotlib.pyplot as plt

from sklearn import preprocessing
from functools import partial
from scipy.stats import skewtest
import numpy as np









LOOKING AT DIFFERENCE BETWEEN CODE X AND Y
CODE X:

# coding: utf-8

# In[ ]:

import pandas as pd
import numpy as np
import os

from sklearn.linear_model import LinearRegression
from sklearn import metrics

import matplotlib.pyplot as plt
get_ipython().magic('matplotlib inline')




CODE Y:

# coding: utf-8

# In[ ]:

get_ipython().magic('matplotlib inline')
import matplotlib.pyplot as plt










In [45]:
# Print the first five cell pairs whose short-name similarity is above 0.5.
total_examples = 5
for key in rd:
    entry = rd[key]
    if not (entry['short_similarity'] > .5):
        continue
    code_x = entry['code_x'][0]
    code_y = entry['code_y'][0]
    print("LOOKING AT DIFFERENCE BETWEEN CODE X AND Y")
    print("CODE X:")
    print(code_x)
    print("\n\nCODE Y:")
    print(code_y)
    print("\n\n\n\n\n\n")
    total_examples -= 1
    if total_examples == 0:
        break


LOOKING AT DIFFERENCE BETWEEN CODE X AND Y
CODE X:

# coding: utf-8

# In[ ]:

import pandas as pd
import numpy as np
import os

from sklearn.linear_model import LinearRegression
from sklearn import metrics

import matplotlib.pyplot as plt
get_ipython().magic('matplotlib inline')




CODE Y:

# coding: utf-8

# In[ ]:

get_ipython().magic('matplotlib inline')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
pd.options.mode.chained_assignment = None  # default='warn'
import warnings
warnings.filterwarnings('ignore')









LOOKING AT DIFFERENCE BETWEEN CODE X AND Y
CODE X:

# coding: utf-8

# In[ ]:

import pandas as pd
import numpy as np
import os

from sklearn.linear_model import LinearRegression
from sklearn import metrics

import matplotlib.pyplot as plt
get_ipython().magic('matplotlib inline')




CODE Y:

# coding: utf-8

# In[ ]:

get_ipython().magic('matplotlib inline')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from nltk import word_tokenize

from string import punctuation
from nltk.corpus import stopwords
import nltk

import json
import seaborn as sns
sns.set_context('notebook')









LOOKING AT DIFFERENCE BETWEEN CODE X AND Y
CODE X:

# coding: utf-8

# In[ ]:

import pandas as pd
import numpy as np
import os

from sklearn.linear_model import LinearRegression
from sklearn import metrics

import matplotlib.pyplot as plt
get_ipython().magic('matplotlib inline')




CODE Y:

# coding: utf-8

# In[ ]:

get_ipython().magic('matplotlib inline')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
#import seaborn as sns
#sns.set_context('notebook')

epfl = pd.read_json('./epfl_en.json')
ethz = pd.read_json('./eth_en.json')









LOOKING AT DIFFERENCE BETWEEN CODE X AND Y
CODE X:

# coding: utf-8

# In[ ]:

import pandas as pd
import numpy as np
import os

from sklearn.linear_model import LinearRegression
from sklearn import metrics

import matplotlib.pyplot as plt
get_ipython().magic('matplotlib inline')




CODE Y:

# coding: utf-8

# In[ ]:

import pandas as pd 
import json 
import warnings
warnings.filterwarnings('ignore')

get_ipython().magic('matplotlib inline')

import matplotlib
import numpy as np
import matplotlib.pyplot as plt

from sklearn import preprocessing
from functools import partial
from scipy.stats import skewtest
import numpy as np









LOOKING AT DIFFERENCE BETWEEN CODE X AND Y
CODE X:

# coding: utf-8

# In[ ]:

import pandas as pd
import numpy as np
import os

from sklearn.linear_model import LinearRegression
from sklearn import metrics

import matplotlib.pyplot as plt
get_ipython().magic('matplotlib inline')




CODE Y:

# coding: utf-8

# In[ ]:

get_ipython().magic('matplotlib inline')
import matplotlib.pyplot as plt










In [46]:
# Print the first five cell pairs whose short-name similarity is above 0.5,
# skipping every pair whose first cell's code mentions 'get_ipython'.
total_examples = 5
for key in rd:
    record = rd[key]
    if not (record['short_similarity'] > .5):
        continue
    first_cell = record['code_x'][0]
    if 'get_ipython' in first_cell:
        continue
    print("LOOKING AT DIFFERENCE BETWEEN CODE X AND Y")
    print("CODE X:")
    print(first_cell)
    print("\n\nCODE Y:")
    print(record['code_y'][0])
    print("\n\n\n\n\n\n")
    total_examples -= 1
    if total_examples == 0:
        break


LOOKING AT DIFFERENCE BETWEEN CODE X AND Y
CODE X:

# coding: utf-8

# In[ ]:

#Load the data in pandas Dataframe

df_epfl = pd.read_json("epfl_en.json")
df_eth = pd.read_json("eth_en.json")




CODE Y:

# coding: utf-8

# In[ ]:

t1 = pd.read_json('epfl_en.json', typ='dataframe')
t2 = pd.read_json('eth_en.json', typ='dataframe')









LOOKING AT DIFFERENCE BETWEEN CODE X AND Y
CODE X:

# coding: utf-8

# In[ ]:

#Load the data in pandas Dataframe

df_epfl = pd.read_json("epfl_en.json")
df_eth = pd.read_json("eth_en.json")




CODE Y:

# coding: utf-8

# In[ ]:

EPFL_en = pd.read_json("epfl_en.json")
ETH_en = pd.read_json("eth_en.json")









LOOKING AT DIFFERENCE BETWEEN CODE X AND Y
CODE X:

# coding: utf-8

# In[ ]:

#Load the data in pandas Dataframe

df_epfl = pd.read_json("epfl_en.json")
df_eth = pd.read_json("eth_en.json")




CODE Y:

# coding: utf-8

# In[ ]:

epfl = pd.read_json('epfl_en.json')
ethz = pd.read_json('eth_en.json')









LOOKING AT DIFFERENCE BETWEEN CODE X AND Y
CODE X:

# coding: utf-8

# In[ ]:

#Load the data in pandas Dataframe

df_epfl = pd.read_json("epfl_en.json")
df_eth = pd.read_json("eth_en.json")




CODE Y:

# coding: utf-8

# In[ ]:

epfl_data = pd.read_json("epfl_en.json")
eth_data = pd.read_json("eth_en.json")









LOOKING AT DIFFERENCE BETWEEN CODE X AND Y
CODE X:

# coding: utf-8

# In[ ]:

#Load the data in pandas Dataframe

df_epfl = pd.read_json("epfl_en.json")
df_eth = pd.read_json("eth_en.json")




CODE Y:

# coding: utf-8

# In[ ]:

epfl = pd.read_json('./epfl_en.json')
epfl.shape










In [51]:
# For up to fifty cell pairs with full-name similarity above 0.5, print the
# collection of full function names recorded for each cell of the pair.
# Pairs whose first cell's code mentions 'get_ipython' are skipped.
total_examples = 50
for key in rd:
    record = rd[key]
    qualifies = (
        record['full_similarity'] > .5
        and 'get_ipython' not in record['code_x'][0]
    )
    if not qualifies:
        continue
    print("LOOKING AT DIFFERENCE BETWEEN SETS FOR X AND Y")
    print("SET X:")
    print(record['full_similarity_x'])
    print("SET Y:")
    print(record['full_similarity_y'])

    total_examples -= 1
    if total_examples == 0:
        break


LOOKING AT DIFFERENCE BETWEEN SETS FOR X AND Y
SET X:
['pandas.read_json', 'pandas.read_json']
SET Y:
['pandas.read_json', 'pandas.read_json']
LOOKING AT DIFFERENCE BETWEEN SETS FOR X AND Y
SET X:
['pandas.read_json', 'pandas.read_json']
SET Y:
['pandas.read_json', 'pandas.read_json']
LOOKING AT DIFFERENCE BETWEEN SETS FOR X AND Y
SET X:
['pandas.read_json', 'pandas.read_json']
SET Y:
['pandas.read_json', 'pandas.read_json']
LOOKING AT DIFFERENCE BETWEEN SETS FOR X AND Y
SET X:
['pandas.read_json', 'pandas.read_json']
SET Y:
['pandas.read_json', 'pandas.read_json']
LOOKING AT DIFFERENCE BETWEEN SETS FOR X AND Y
SET X:
['pandas.read_json', 'pandas.read_json']
SET Y:
['pandas.read_json']
LOOKING AT DIFFERENCE BETWEEN SETS FOR X AND Y
SET X:
['pandas.read_json', 'pandas.read_json']
SET Y:
['pandas.read_json']
LOOKING AT DIFFERENCE BETWEEN SETS FOR X AND Y
SET X:
['pandas.read_json', 'pandas.read_json']
SET Y:
['pandas.read_json', 'pandas.read_json']
LOOKING AT DIFFERENCE BETWEEN SETS FOR X AND Y
SET X:
['pandas.read_json', 'pandas.read_json']
SET Y:
['pandas.read_json', 'pandas.read_json']
LOOKING AT DIFFERENCE BETWEEN SETS FOR X AND Y
SET X:
['pandas.read_json', 'pandas.read_json']
SET Y:
['pandas.read_json', 'pandas.read_json']
LOOKING AT DIFFERENCE BETWEEN SETS FOR X AND Y
SET X:
['pandas.read_json', 'pandas.read_json']
SET Y:
['pandas.read_json']
LOOKING AT DIFFERENCE BETWEEN SETS FOR X AND Y
SET X:
['pandas.read_json', 'pandas.read_json']
SET Y:
['pandas.read_json']
LOOKING AT DIFFERENCE BETWEEN SETS FOR X AND Y
SET X:
['pandas.read_json', 'pandas.read_json']
SET Y:
['pandas.read_json', 'pandas.read_json']
LOOKING AT DIFFERENCE BETWEEN SETS FOR X AND Y
SET X:
['pandas.read_json', 'pandas.read_json']
SET Y:
['pandas.read_json', 'pandas.read_json']
LOOKING AT DIFFERENCE BETWEEN SETS FOR X AND Y
SET X:
['pandas.read_json', 'pandas.read_json']
SET Y:
['pandas.read_json', 'pandas.read_json']
LOOKING AT DIFFERENCE BETWEEN SETS FOR X AND Y
SET X:
['pandas.read_json', 'pandas.read_json']
SET Y:
['pandas.read_json', 'pandas.read_json']
LOOKING AT DIFFERENCE BETWEEN SETS FOR X AND Y
SET X:
['pandas.read_json', 'pandas.read_json']
SET Y:
['pandas.read_json', 'pandas.read_json']
LOOKING AT DIFFERENCE BETWEEN SETS FOR X AND Y
SET X:
['pandas.read_json', 'pandas.read_json']
SET Y:
['pandas.read_json', 'pandas.read_json']
LOOKING AT DIFFERENCE BETWEEN SETS FOR X AND Y
SET X:
['pandas.read_json', 'pandas.read_json']
SET Y:
['pandas.read_json']
LOOKING AT DIFFERENCE BETWEEN SETS FOR X AND Y
SET X:
['pandas.read_json', 'pandas.read_json']
SET Y:
['pandas.read_json']
LOOKING AT DIFFERENCE BETWEEN SETS FOR X AND Y
SET X:
['pandas.read_json', 'pandas.read_json']
SET Y:
['pandas.read_json', 'pandas.read_json']
LOOKING AT DIFFERENCE BETWEEN SETS FOR X AND Y
SET X:
['pandas.read_json', 'pandas.read_json']
SET Y:
['pandas.read_json', 'pandas.read_json']
LOOKING AT DIFFERENCE BETWEEN SETS FOR X AND Y
SET X:
['pandas.read_json', 'pandas.read_json']
SET Y:
['pandas.read_json', 'pandas.read_json']
LOOKING AT DIFFERENCE BETWEEN SETS FOR X AND Y
SET X:
['pandas.read_json', 'pandas.read_json']
SET Y:
['pandas.read_json', 'pandas.read_json']
LOOKING AT DIFFERENCE BETWEEN SETS FOR X AND Y
SET X:
['pandas.read_json', 'pandas.read_json']
SET Y:
['pandas.read_json', 'pandas.read_json']
LOOKING AT DIFFERENCE BETWEEN SETS FOR X AND Y
SET X:
['pandas.read_json', 'pandas.read_json']
SET Y:
['pandas.read_json', 'pandas.read_json']
LOOKING AT DIFFERENCE BETWEEN SETS FOR X AND Y
SET X:
['pandas.read_json', 'pandas.read_json']
SET Y:
['pandas.read_json', 'pandas.read_json']
LOOKING AT DIFFERENCE BETWEEN SETS FOR X AND Y
SET X:
['pandas.read_json', 'pandas.read_json']
SET Y:
['pandas.read_json']
LOOKING AT DIFFERENCE BETWEEN SETS FOR X AND Y
SET X:
['pandas.read_json', 'pandas.read_json']
SET Y:
['pandas.read_json', 'pandas.read_json']
LOOKING AT DIFFERENCE BETWEEN SETS FOR X AND Y
SET X:
['pandas.read_json', 'pandas.read_json']
SET Y:
['pandas.read_json', 'pandas.read_json']
LOOKING AT DIFFERENCE BETWEEN SETS FOR X AND Y
SET X:
['pandas.read_json', 'pandas.read_json']
SET Y:
['pandas.read_json', 'pandas.read_json']
LOOKING AT DIFFERENCE BETWEEN SETS FOR X AND Y
SET X:
['pandas.read_json', 'pandas.read_json']
SET Y:
['pandas.read_json', 'pandas.read_json']
LOOKING AT DIFFERENCE BETWEEN SETS FOR X AND Y
SET X:
['pandas.read_json', 'pandas.read_json']
SET Y:
['pandas.read_json', 'pandas.read_json']
LOOKING AT DIFFERENCE BETWEEN SETS FOR X AND Y
SET X:
['pandas.read_json', 'pandas.read_json']
SET Y:
['pandas.read_json']
LOOKING AT DIFFERENCE BETWEEN SETS FOR X AND Y
SET X:
['pandas.read_json', 'pandas.read_json']
SET Y:
['pandas.read_json']
LOOKING AT DIFFERENCE BETWEEN SETS FOR X AND Y
SET X:
['pandas.read_json', 'pandas.read_json']
SET Y:
['pandas.read_json', 'pandas.read_json']
LOOKING AT DIFFERENCE BETWEEN SETS FOR X AND Y
SET X:
['pandas.read_json', 'pandas.read_json']
SET Y:
['pandas.read_json']
LOOKING AT DIFFERENCE BETWEEN SETS FOR X AND Y
SET X:
['pandas.read_json', 'pandas.read_json']
SET Y:
['pandas.read_json']
LOOKING AT DIFFERENCE BETWEEN SETS FOR X AND Y
SET X:
['pandas.read_json', 'pandas.read_json']
SET Y:
['pandas.read_json']
LOOKING AT DIFFERENCE BETWEEN SETS FOR X AND Y
SET X:
['pandas.read_json', 'pandas.read_json']
SET Y:
['pandas.read_json']
LOOKING AT DIFFERENCE BETWEEN SETS FOR X AND Y
SET X:
['pandas.read_json', 'pandas.read_json']
SET Y:
['pandas.read_json', 'pandas.read_json']
LOOKING AT DIFFERENCE BETWEEN SETS FOR X AND Y
SET X:
['pandas.read_json', 'pandas.read_json']
SET Y:
['pandas.read_json']
LOOKING AT DIFFERENCE BETWEEN SETS FOR X AND Y
SET X:
['pandas.read_json', 'pandas.read_json']
SET Y:
['pandas.read_json', 'pandas.read_json']
LOOKING AT DIFFERENCE BETWEEN SETS FOR X AND Y
SET X:
['pandas.read_json', 'pandas.read_json']
SET Y:
['pandas.read_json', 'pandas.read_json']
LOOKING AT DIFFERENCE BETWEEN SETS FOR X AND Y
SET X:
['pandas.read_json', 'pandas.read_json']
SET Y:
['pandas.read_json', 'pandas.read_json']
LOOKING AT DIFFERENCE BETWEEN SETS FOR X AND Y
SET X:
['pandas.read_json', 'pandas.read_json']
SET Y:
['pandas.read_json', 'pandas.read_json']
LOOKING AT DIFFERENCE BETWEEN SETS FOR X AND Y
SET X:
['pandas.read_json', 'pandas.read_json']
SET Y:
['pandas.read_json', 'pandas.read_json']
LOOKING AT DIFFERENCE BETWEEN SETS FOR X AND Y
SET X:
['pandas.read_json', 'pandas.read_json']
SET Y:
['pandas.read_json', 'pandas.read_json']
LOOKING AT DIFFERENCE BETWEEN SETS FOR X AND Y
SET X:
['pandas.read_json', 'pandas.read_json']
SET Y:
['pandas.read_json', 'pandas.read_json']
LOOKING AT DIFFERENCE BETWEEN SETS FOR X AND Y
SET X:
['pandas.read_json', 'pandas.read_json']
SET Y:
['pandas.read_json', 'pandas.read_json']
LOOKING AT DIFFERENCE BETWEEN SETS FOR X AND Y
SET X:
['pandas.read_json', 'pandas.read_json']
SET Y:
['pandas.read_json', 'pandas.read_json']

In [54]:
# Some of the cells turned out to be really similar, so zero in on the
# linear regression ones: show five pairs with full-name similarity above 0.5
# whose first cell calls sklearn.linear_model.LinearRegression.
total_examples = 5
for key in rd:
    record = rd[key]
    names_x = record['full_similarity_x']
    is_match = (
        record['full_similarity'] > .5
        and 'sklearn.linear_model.LinearRegression' in names_x
    )
    if not is_match:
        continue
    print("LOOKING AT DIFFERENCE BETWEEN SETS FOR X AND Y")
    print("SET X:")
    print(names_x)
    print(record['code_x'][0])
    print("SET Y:")
    print(record['full_similarity_y'])
    print(record['code_y'][0])
    print('\n\n\n\n\n\n')
    total_examples -= 1
    if total_examples == 0:
        break


LOOKING AT DIFFERENCE BETWEEN SETS FOR X AND Y
SET X:
['sklearn.linear_model.LinearRegression']

# coding: utf-8

# In[ ]:

regr = linear_model.LinearRegression()


SET Y:
['sklearn.linear_model.LinearRegression']

# coding: utf-8

# In[ ]:

from sklearn.model_selection import train_test_split
reg=LinearRegression()









LOOKING AT DIFFERENCE BETWEEN SETS FOR X AND Y
SET X:
['sklearn.linear_model.LinearRegression']

# coding: utf-8

# In[ ]:

regr = linear_model.LinearRegression()


SET Y:
['sklearn.linear_model.LinearRegression']

# coding: utf-8

# In[ ]:

reg2=LinearRegression()









LOOKING AT DIFFERENCE BETWEEN SETS FOR X AND Y
SET X:
['sklearn.linear_model.LinearRegression']

# coding: utf-8

# In[ ]:

regr = linear_model.LinearRegression()


SET Y:
['sklearn.linear_model.LinearRegression']

# coding: utf-8

# In[ ]:

reg=LinearRegression()









LOOKING AT DIFFERENCE BETWEEN SETS FOR X AND Y
SET X:
['sklearn.linear_model.LinearRegression']

# coding: utf-8

# In[ ]:

regr = linear_model.LinearRegression()


SET Y:
['sklearn.linear_model.LinearRegression']

# coding: utf-8

# In[ ]:

from sklearn import linear_model
# Create linear regression object
regr = linear_model.LinearRegression()









LOOKING AT DIFFERENCE BETWEEN SETS FOR X AND Y
SET X:
['sklearn.linear_model.LinearRegression']

# coding: utf-8

# In[ ]:

regr = linear_model.LinearRegression()


SET Y:
['sklearn.linear_model.LinearRegression']

# coding: utf-8

# In[ ]:

r = LinearRegression(normalize=True)










In [55]:
# The previous examples were pretty short, so look for similar pairs
# ('full_similarity' above 0.5) where cell X has more than 10 entries in its
# call list. Repeated calls stay in that list, so this counts calls rather
# than distinct functions. Print up to five such pairs.
total_examples = 5
for key in rd:
    entry = rd[key]
    # Short-circuit keeps the call list from being read for low-similarity pairs.
    if entry['full_similarity'] > .5 and len(entry['full_similarity_x']) > 10:
        print("LOOKING AT DIFFERENCE BETWEEN SETS FOR X AND Y")
        print("SET X:")
        print(entry['full_similarity_x'])
        # Printing the source string in one call emits the same text as
        # splitting on newlines and printing each piece.
        print(entry['code_x'][0])
        print("SET Y:")
        print(entry['full_similarity_y'])
        print(entry['code_y'][0])
        print('\n\n\n\n\n\n')
        total_examples -= 1
        # The counter only changes in this branch, so the limit can only be
        # reached here.
        if total_examples == 0:
            break


LOOKING AT DIFFERENCE BETWEEN SETS FOR X AND Y
SET X:
['print', 'matplotlib.pyplot.plot', 'matplotlib.pyplot.plot', 'matplotlib.pyplot.xlabel', 'matplotlib.pyplot.ylabel', 'matplotlib.pyplot.legend', 'matplotlib.pyplot.show', 'print', 'matplotlib.pyplot.plot', 'matplotlib.pyplot.plot', 'matplotlib.pyplot.xlabel', 'matplotlib.pyplot.ylabel', 'matplotlib.pyplot.legend', 'matplotlib.pyplot.show', 'group_by_and_aggregate', 'group_by_and_aggregate', 'group_by_and_aggregate', 'group_by_and_aggregate']

# coding: utf-8

# In[ ]:

print('Favorites per year:')
plt.plot(group_by_and_aggregate(epfl[['favorite_count', 'created_at']], lambda dt: dt.year), color='r', label='EPFL')
plt.plot(group_by_and_aggregate(ethz[['favorite_count', 'created_at']], lambda dt: dt.year), color='b', label='ETHZ')
plt.xlabel('Year')
plt.ylabel('Favorites')
plt.legend()
plt.show()
print('Retweets per year:')
plt.plot(group_by_and_aggregate(epfl[['retweet_count', 'created_at']], lambda dt: dt.year), color='r', label='EPFL')
plt.plot(group_by_and_aggregate(ethz[['retweet_count', 'created_at']], lambda dt: dt.year), color='b', label='ETHZ')
plt.xlabel('Year')
plt.ylabel('Retweets')
plt.legend()
plt.show()


SET Y:
['matplotlib.pyplot.plot', 'matplotlib.pyplot.plot', 'matplotlib.pyplot.xlabel', 'matplotlib.pyplot.ylabel', 'matplotlib.pyplot.legend']

# coding: utf-8

# In[ ]:

plt.figure
plt.plot(grouped_by_year_eth.favorite_count, '-', color="b", label="eth")
plt.plot(grouped_by_year_epfl.favorite_count, '-', color="r", label=" epfl")

plt.xlabel('years')
plt.ylabel('number of favorites')
plt.legend(loc="best")









LOOKING AT DIFFERENCE BETWEEN SETS FOR X AND Y
SET X:
['print', 'matplotlib.pyplot.plot', 'matplotlib.pyplot.plot', 'matplotlib.pyplot.xlabel', 'matplotlib.pyplot.ylabel', 'matplotlib.pyplot.legend', 'matplotlib.pyplot.show', 'print', 'matplotlib.pyplot.plot', 'matplotlib.pyplot.plot', 'matplotlib.pyplot.xlabel', 'matplotlib.pyplot.ylabel', 'matplotlib.pyplot.legend', 'matplotlib.pyplot.show', 'group_by_and_aggregate', 'group_by_and_aggregate', 'group_by_and_aggregate', 'group_by_and_aggregate']

# coding: utf-8

# In[ ]:

print('Favorites per year:')
plt.plot(group_by_and_aggregate(epfl[['favorite_count', 'created_at']], lambda dt: dt.year), color='r', label='EPFL')
plt.plot(group_by_and_aggregate(ethz[['favorite_count', 'created_at']], lambda dt: dt.year), color='b', label='ETHZ')
plt.xlabel('Year')
plt.ylabel('Favorites')
plt.legend()
plt.show()
print('Retweets per year:')
plt.plot(group_by_and_aggregate(epfl[['retweet_count', 'created_at']], lambda dt: dt.year), color='r', label='EPFL')
plt.plot(group_by_and_aggregate(ethz[['retweet_count', 'created_at']], lambda dt: dt.year), color='b', label='ETHZ')
plt.xlabel('Year')
plt.ylabel('Retweets')
plt.legend()
plt.show()


SET Y:
['matplotlib.pyplot.plot', 'matplotlib.pyplot.plot', 'matplotlib.pyplot.xlabel', 'matplotlib.pyplot.ylabel', 'matplotlib.pyplot.legend']

# coding: utf-8

# In[ ]:

plt.figure
plt.plot(grouped_by_year_eth.retweet_count, '-', color="b", label="eth")
plt.plot(grouped_by_year_epfl.retweet_count, '-', color="r", label=" epfl")
plt.xlabel('years')
plt.ylabel('number of retweets')
plt.legend(loc="best")









LOOKING AT DIFFERENCE BETWEEN SETS FOR X AND Y
SET X:
['print', 'matplotlib.pyplot.plot', 'matplotlib.pyplot.plot', 'matplotlib.pyplot.xlabel', 'matplotlib.pyplot.ylabel', 'matplotlib.pyplot.legend', 'matplotlib.pyplot.show', 'print', 'matplotlib.pyplot.plot', 'matplotlib.pyplot.plot', 'matplotlib.pyplot.xlabel', 'matplotlib.pyplot.ylabel', 'matplotlib.pyplot.legend', 'matplotlib.pyplot.show', 'group_by_and_aggregate', 'group_by_and_aggregate', 'group_by_and_aggregate', 'group_by_and_aggregate']

# coding: utf-8

# In[ ]:

print('Favorites per year:')
plt.plot(group_by_and_aggregate(epfl[['favorite_count', 'created_at']], lambda dt: dt.year), color='r', label='EPFL')
plt.plot(group_by_and_aggregate(ethz[['favorite_count', 'created_at']], lambda dt: dt.year), color='b', label='ETHZ')
plt.xlabel('Year')
plt.ylabel('Favorites')
plt.legend()
plt.show()
print('Retweets per year:')
plt.plot(group_by_and_aggregate(epfl[['retweet_count', 'created_at']], lambda dt: dt.year), color='r', label='EPFL')
plt.plot(group_by_and_aggregate(ethz[['retweet_count', 'created_at']], lambda dt: dt.year), color='b', label='ETHZ')
plt.xlabel('Year')
plt.ylabel('Retweets')
plt.legend()
plt.show()


SET Y:
['matplotlib.pyplot.plot', 'matplotlib.pyplot.plot', 'matplotlib.pyplot.xlabel', 'matplotlib.pyplot.ylabel', 'matplotlib.pyplot.legend']

# coding: utf-8

# In[ ]:

plt.figure
plt.plot(grouped_by_month_eth.retweet_count, '-', color="b", label="eth")
plt.plot(grouped_by_month_epfl.retweet_count, '-', color="r", label=" epfl")
plt.xlabel('month')
plt.ylabel('number of retweets')
plt.legend(loc="best")









LOOKING AT DIFFERENCE BETWEEN SETS FOR X AND Y
SET X:
['print', 'matplotlib.pyplot.plot', 'matplotlib.pyplot.plot', 'matplotlib.pyplot.xlabel', 'matplotlib.pyplot.ylabel', 'matplotlib.pyplot.legend', 'matplotlib.pyplot.show', 'print', 'matplotlib.pyplot.plot', 'matplotlib.pyplot.plot', 'matplotlib.pyplot.xlabel', 'matplotlib.pyplot.ylabel', 'matplotlib.pyplot.legend', 'matplotlib.pyplot.show', 'group_by_and_aggregate', 'group_by_and_aggregate', 'group_by_and_aggregate', 'group_by_and_aggregate']

# coding: utf-8

# In[ ]:

print('Favorites per year:')
plt.plot(group_by_and_aggregate(epfl[['favorite_count', 'created_at']], lambda dt: dt.year), color='r', label='EPFL')
plt.plot(group_by_and_aggregate(ethz[['favorite_count', 'created_at']], lambda dt: dt.year), color='b', label='ETHZ')
plt.xlabel('Year')
plt.ylabel('Favorites')
plt.legend()
plt.show()
print('Retweets per year:')
plt.plot(group_by_and_aggregate(epfl[['retweet_count', 'created_at']], lambda dt: dt.year), color='r', label='EPFL')
plt.plot(group_by_and_aggregate(ethz[['retweet_count', 'created_at']], lambda dt: dt.year), color='b', label='ETHZ')
plt.xlabel('Year')
plt.ylabel('Retweets')
plt.legend()
plt.show()


SET Y:
['matplotlib.pyplot.plot', 'matplotlib.pyplot.plot', 'matplotlib.pyplot.xlabel', 'matplotlib.pyplot.ylabel', 'matplotlib.pyplot.legend']

# coding: utf-8

# In[ ]:

plt.figure
plt.plot(grouped_by_month_eth.favorite_count, '-', color="b", label="eth")
plt.plot(grouped_by_month_epfl.favorite_count, '-', color="r", label=" epfl")
plt.xlabel('month')
plt.ylabel('number of favorites')
plt.legend(loc="best")









LOOKING AT DIFFERENCE BETWEEN SETS FOR X AND Y
SET X:
['print', 'matplotlib.pyplot.plot', 'matplotlib.pyplot.plot', 'matplotlib.pyplot.xlabel', 'matplotlib.pyplot.ylabel', 'matplotlib.pyplot.legend', 'matplotlib.pyplot.show', 'print', 'matplotlib.pyplot.plot', 'matplotlib.pyplot.plot', 'matplotlib.pyplot.xlabel', 'matplotlib.pyplot.ylabel', 'matplotlib.pyplot.legend', 'matplotlib.pyplot.show', 'group_by_and_aggregate', 'group_by_and_aggregate', 'group_by_and_aggregate', 'group_by_and_aggregate']

# coding: utf-8

# In[ ]:

print('Favorites per year:')
plt.plot(group_by_and_aggregate(epfl[['favorite_count', 'created_at']], lambda dt: dt.year), color='r', label='EPFL')
plt.plot(group_by_and_aggregate(ethz[['favorite_count', 'created_at']], lambda dt: dt.year), color='b', label='ETHZ')
plt.xlabel('Year')
plt.ylabel('Favorites')
plt.legend()
plt.show()
print('Retweets per year:')
plt.plot(group_by_and_aggregate(epfl[['retweet_count', 'created_at']], lambda dt: dt.year), color='r', label='EPFL')
plt.plot(group_by_and_aggregate(ethz[['retweet_count', 'created_at']], lambda dt: dt.year), color='b', label='ETHZ')
plt.xlabel('Year')
plt.ylabel('Retweets')
plt.legend()
plt.show()


SET Y:
['matplotlib.pyplot.plot', 'matplotlib.pyplot.plot', 'matplotlib.pyplot.xlabel', 'matplotlib.pyplot.ylabel', 'matplotlib.pyplot.legend']

# coding: utf-8

# In[ ]:

plt.figure
plt.plot(grouped_by_hour_eth.retweet_count, '-', color="b", label="eth")
plt.plot(grouped_by_hour_epfl.retweet_count, '-', color="r", label=" epfl")
plt.xlabel('hours')
plt.ylabel('number of retweets')
plt.legend(loc="best")










In [ ]: